library(Seurat)
library(tidyverse)
library(VennDiagram)
library(clusterProfiler)
library(org.Hs.eg.db) 
dat <- readRDS("pseudobulk/DE_data.Rds")
dat <- SetIdent(dat, value="label")
dat <- NormalizeData(dat)

Evaluation DGE algorithms

All the results were filtered by the adj.p-value < 0.05 and logFC >= 0.3

dat
## An object of class Seurat 
## 15327 features across 6961 samples within 1 assay 
## Active assay: RNA (15327 features, 0 variable features)
##  2 dimensional reductions calculated: umap, pca

Single cell methods:

For single cell methods I am going to consider both Seurat and Scanpy

Scanpy

here

For Scanpy I take in account the methods: - t-test_overestim_var - wilcoxon

Unfortunately the logistic regression provided by scanpy does not give LogFoldChanges and will not be taken in account.

scanpy_t = read.csv("scanpy/DEG_adata_t.csv")
scanpy_t$X <- NULL
scanpy_wx <- read.csv("scanpy/DEG_adata_wx.csv")
scanpy_wx$X <- NULL

num_genes <- data.frame(matrix(ncol = 2, nrow = 0))  
colnames(num_genes) <- c("Method", "nGenes")

newrow <- c("scanpy_t", dim(scanpy_t)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
newrow <- c("scanpy_wx", dim(scanpy_wx)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow

plot_genes <- function(num_genes){
  num_genes$nGenes = as.integer(num_genes$nGenes)
ggplot(num_genes, aes(x=Method, y=nGenes, fill=Method))+
  geom_bar(stat="identity",width = 0.5)+
  geom_text(aes(label=nGenes), vjust=1.6, color="white", size=3.5)+
  theme_minimal()
}
plot_genes(num_genes)

grid.newpage()
venn_object <- venn.diagram(
        x = list(scanpy_t$Symbol, scanpy_wx$Symbol),
        category.names = c("scanpy_t", "scanpy_wx"),
        filename = NULL
        )
grid.draw(venn_object)

The t-test_overestim_var recognized as significative almost all the genes and include almost all the genes from the wilcoxon test. Nolw let’s evaluate the expression of the genes.

Scanpy T-test

Top upregulated genes in Erythrocytes:

plot_top_genes <- function(data, column){
  max_lfc <- data[order(-data[column]),] %>%
  head(3) 

  max_lfc <- max_lfc$Symbol

  RidgePlot(dat, features = max_lfc)
}

plot_top_genes(scanpy_t,"logfoldchanges")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0665
## Picking joint bandwidth of 0.0505
## Picking joint bandwidth of 0.0478

Top downregulated genes in Erythrocytes:

plot_bottom_genes <- function(data, column){
  max_lfc <- data[order(-data[column]),] %>%
  tail(3) 

  max_lfc <- max_lfc$Symbol

  RidgePlot(dat, features = max_lfc)
}

plot_bottom_genes(scanpy_t,"logfoldchanges")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0478
## Picking joint bandwidth of 0.0485
## Picking joint bandwidth of 0.069

Genes with lower abs(logFC)

plot_low_genes <- function(data,column){
  max_lfc <- data[order(-data[column]),] %>%
  tail(3) 

  max_lfc <- max_lfc$Symbol

  RidgePlot(dat, features = max_lfc)
}

plot_low_genes(scanpy_t,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.144
## Picking joint bandwidth of 1.98e-06
## Picking joint bandwidth of 1.98e-06

The result’s in terms of LogFC looks really odd. Let’s see the values for this genes:

scanpy_t[scanpy_t$Symbol %in% c("KRT1","ARG1", "LINC00570", "CD72", "ARPP21", "MME"),]
##          Symbol     pvals_adj logfoldchanges  abs_lfc    scores
## 68         KRT1  0.000000e+00       29.73123 29.73123  48.64429
## 113   LINC00570 3.816327e-227       28.53994 28.53994  35.04078
## 131        ARG1 2.312735e-192       28.81118 28.81118  31.80785
## 13375    ARPP21 1.717660e-226      -29.04441 29.04441 -34.57604
## 13480      CD72 1.808806e-242      -28.93440 28.93440 -35.96811
## 13887       MME  0.000000e+00      -29.52419 29.52419 -44.01670
##       Aberrant.erythroid Pro.B.cells
## 68             0.4846416   0.0000000
## 113            0.3146137   0.0000000
## 131            0.2832765   0.0000000
## 13375          0.0000000   0.2891921
## 13480          0.0000000   0.3130016
## 13887          0.0000000   0.4122525

Apparently Scanpy has a bias evaluating the logFC of genes that are completely missing in one of the conditions. But The genes are correctly plotted in the report. So I will check the expression by the z-score instead of the LogFC:

print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(scanpy_t,"scores")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.05
## Picking joint bandwidth of 0.089

print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(scanpy_t,"scores")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0755
## Picking joint bandwidth of 0.035
## Picking joint bandwidth of 0.0609

print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
max_lfc <- scanpy_t[order(-abs(scanpy_t["scores"])),] %>%
  tail(3) 
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
max_lfc <- max_lfc$Symbol

RidgePlot(dat, features = max_lfc)
## Picking joint bandwidth of 1.78e-06
## Picking joint bandwidth of 1.78e-06
## Picking joint bandwidth of 1.78e-06

Better result for the extremes logFC (scores), bt still not clear results for the low scored genes. Indeed those genes are expressed by less than 1% of cells:

scanpy_t[scanpy_t$Symbol %in% max_lfc,]
##         Symbol  pvals_adj logfoldchanges  abs_lfc    scores Aberrant.erythroid
## 816 MIR133A1HG 0.04988015      -2.444078 2.444078 -1.980822       0.0006205399
## 817       UACA 0.04972146      -1.606474 1.606474 -1.982030       0.0009308098
## 818    CEP170B 0.04952603      -2.087353 2.087353 -1.983738       0.0003102699
##     Pro.B.cells
## 816 0.002140182
## 817 0.005082932
## 818 0.003477796
plot_upGOs <- function(data,column){
upgenes <- data[data[column] > 0,] %>%
  dplyr::select(Symbol) %>%
  pull()

upGO <- enrichGO(upgenes, org.Hs.eg.db, keyType = "SYMBOL", ont="BP", pAdjustMethod = "BH", readable="False")

upGO <- simplify(upGO, cutoff=0.7, by="p.adjust", select_fun = min,measure = "Wang")

print("Upregulated GO terms")
dotplot(upGO)
}


plot_downGOs <- function(data,column){
downgenes  <- data[data[column] < 0,] %>%
  dplyr::select(Symbol) %>%
  pull()

downGO <- enrichGO(downgenes, org.Hs.eg.db, keyType = "SYMBOL", ont="BP", pAdjustMethod = "BH", readable="False")

downGO <- simplify(downGO, cutoff=0.7, by="p.adjust", select_fun = min,measure = "Wang")

print("Downregulated GO terms")
dotplot(downGO)
}

plot_upGOs(scanpy_t, "scores")
## [1] "Upregulated GO terms"

plot_downGOs(scanpy_t, "scores")
## [1] "Downregulated GO terms"

Few up terms can be linked to aberrant erythroid cells, down terms looks slightly more specific for the condition of pro0B cells

Scanpy wilcoxon

print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(scanpy_wx,"logfoldchanges")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0665
## Picking joint bandwidth of 0.0505
## Picking joint bandwidth of 0.0478

print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(scanpy_wx,"logfoldchanges")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0478
## Picking joint bandwidth of 0.0485
## Picking joint bandwidth of 0.069

print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(scanpy_wx,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 2.51e-06
## Picking joint bandwidth of 0.0327
## Picking joint bandwidth of 1.98e-06

plot_upGOs(scanpy_wx, "scores")
## [1] "Upregulated GO terms"

plot_downGOs(scanpy_wx, "scores")
## [1] "Downregulated GO terms"

Similar result with wilcoxon, slightly better GO terms

Seurat

here

For Seurat I take in account the methods: - t-test - wilcoxon - logistic regression

seurat_t <- read_csv("seurat/t-test_seurat.csv")%>%
  filter(cluster == "Aberrant erythroid")
## Rows: 8142 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cluster, gene
## dbl (6): p_val, avg_log2FC, pct.1, pct.2, p_val_adj, abs_lfc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(seurat_t)[names(seurat_t) == 'gene'] <- 'Symbol'

newrow <- c("seurat_t", dim(seurat_t)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow

seurat_wx <- read_csv("seurat/wilcoxon_seurat.csv")%>%
  filter(cluster == "Aberrant erythroid")
## Rows: 8134 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cluster, gene
## dbl (6): p_val, avg_log2FC, pct.1, pct.2, p_val_adj, abs_lfc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(seurat_wx)[names(seurat_wx) == 'gene'] <- 'Symbol'

newrow <- c("seurat_wx", dim(seurat_wx)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow

seurat_logreg <- read_csv("seurat/wilcoxon_seurat.csv")%>%
  filter(cluster == "Aberrant erythroid")
## Rows: 8134 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cluster, gene
## dbl (6): p_val, avg_log2FC, pct.1, pct.2, p_val_adj, abs_lfc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(seurat_logreg)[names(seurat_logreg) == 'gene'] <- 'Symbol'

newrow <- c("seurat_logreg", dim(seurat_logreg)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
plot_genes(num_genes)

grid.newpage()
venn_object <- venn.diagram(
        x = list(seurat_t$Symbol, seurat_wx$Symbol, seurat_logreg$Symbol),
        category.names = c("seurat_t", "seurat_wx", "seurat_logreg"),
        filename = NULL
        )
grid.draw(venn_object)

Similar numbers of DE genes were found by Seurat’s different flavours.

Seurat T test

print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(seurat_t,"avg_log2FC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188

print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(seurat_t,"avg_log2FC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.12
## Picking joint bandwidth of 0.084
## Picking joint bandwidth of 0.0609

print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(seurat_t,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 2.03e-06
## Picking joint bandwidth of 2.06e-06
## Picking joint bandwidth of 1.95e-06

plot_upGOs(seurat_t, "avg_log2FC")
## [1] "Upregulated GO terms"

plot_downGOs(seurat_t, "avg_log2FC")
## [1] "Downregulated GO terms"

Better result with seurat, the top upregulated genes are strictly correlated with erythrocytes. Only CD74 is downregulated and correlated with immune cells. The low abs(LFC) genes still are not well evaluable. In average the different gene expression is better appreciable

GO terms are also better compared to scanpy

Seurat wilcoxon

print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(seurat_wx,"avg_log2FC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188

print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(seurat_wx,"avg_log2FC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.12
## Picking joint bandwidth of 0.084
## Picking joint bandwidth of 0.0609

print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(seurat_wx,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 2.03e-06
## Picking joint bandwidth of 2.06e-06
## Picking joint bandwidth of 1.95e-06

Similar to the previous

Seurat logreg

print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(seurat_logreg,"avg_log2FC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188

print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(seurat_logreg,"avg_log2FC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.12
## Picking joint bandwidth of 0.084
## Picking joint bandwidth of 0.0609

print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(seurat_logreg,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 2.03e-06
## Picking joint bandwidth of 2.06e-06
## Picking joint bandwidth of 1.95e-06

As above.

Now let’s see the genes fuond uniquely by logreg/wx and t-test:

Unique in wx/logeg:

unique_wx = setdiff(seurat_logreg$Symbol,seurat_t$Symbol)
seurat_logreg[seurat_logreg$Symbol %in% unique_wx,]
## # A tibble: 1 × 8
##      p_val avg_log2FC pct.1 pct.2   p_val_adj cluster            Symbol abs_lfc
##      <dbl>      <dbl> <dbl> <dbl>       <dbl> <chr>              <chr>    <dbl>
## 1 1.55e-11      0.319 0.289 0.407 0.000000237 Aberrant erythroid NEAT1    0.319
RidgePlot(dat, features = unique_wx)
## Picking joint bandwidth of 0.139

VlnPlot(dat, features = unique_wx, group.by = "bioprojects", split.by = "label")
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##       
## This message will be shown once per session.
## Warning: Groups with fewer than two data points have been dropped.

NEAT1 result upregulated in Aberrant erythroid cells only with wilcoxon/logreg method. The logFC are quite small (0.32) and the distribution is similar between the cell lines. Looking at the expression divided by the batch, no batch specific differential expression is available.

Unique in t-test:

unique_t = setdiff(seurat_t$Symbol,seurat_logreg$Symbol)
seurat_t[seurat_t$Symbol %in% unique_t,]
## # A tibble: 5 × 8
##      p_val avg_log2FC pct.1 pct.2 p_val_adj cluster            Symbol abs_lfc
##      <dbl>      <dbl> <dbl> <dbl>     <dbl> <chr>              <chr>    <dbl>
## 1 1.11e-15      0.469 0.237 0.216  1.70e-11 Aberrant erythroid RNF19A   0.469
## 2 2.33e- 7      0.432 0.24  0.284  3.58e- 3 Aberrant erythroid MGST3    0.432
## 3 3.87e-12      0.311 0.207 0.175  5.93e- 8 Aberrant erythroid ZNF451   0.311
## 4 2.10e- 6     -0.313 0.398 0.391  3.22e- 2 Aberrant erythroid ODC1     0.313
## 5 2.39e- 7     -0.520 0.483 0.457  3.67e- 3 Aberrant erythroid REXO2    0.520
RidgePlot(dat, features = unique_t)
## Picking joint bandwidth of 3.07e-06
## Picking joint bandwidth of 0.0324
## Picking joint bandwidth of 2.6e-06
## Picking joint bandwidth of 0.13
## Picking joint bandwidth of 0.144

VlnPlot(dat, features = unique_t, group.by = "bioprojects", split.by = "label", ncol = 2)
## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

All the genes doesn’t have a clear difference in the expression among the batches, I will drop the seurat t-test and keep the logreg/wx

Pseudobulk

In the pseudobulks, counts were summed for each celltype and sample, in practice obtaining a column of counts for each celltype and sample. Then, genes that weren’t showing counts in more than 90% of the cells were removed. Also, genes showing expression below 10 counts in all the pseudobulks were removed (pseudobulks are the sum of cells counts. showing less than 10 counts indicate almost all the cells didn’t had count’s for that gene).

pseudobulk = read_csv("pseudobulk/Pseudobulk_clean.csv")
## Rows: 11505 Columns: 121
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (1): Symbol
## dbl (120): Aberrant_erythroid_MGUS_CD138nCD45p_2, Aberrant_erythroid_MGUS_CD...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(pseudobulk)
## [1] 11505   121

DESeq2

For DESeq2, firstly I run it with the “LRT” methods that showed better results here. The PCA plot looks good but the clustering mix some samples here. I run DESeq2 specifing the model to take account of the batches.

Then, I tested combat_seq to correct the batch effect here. Also, I decided to see if there was a batch effect afrom unknown sources using the single variable analysis (sva) here.

deseq2 <- read_csv("pseudobulk/DESeq2.csv") %>%
  mutate(abs_lfc = abs(log2FoldChange))
## Rows: 1945 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gene
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(deseq2)[names(deseq2) == 'gene'] <- 'Symbol'


newrow <- c("deseq2", dim(deseq2)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow


deseq2_combat <- read_csv("pseudobulk/DESeq2_combat.csv") %>%
  mutate(abs_lfc = abs(log2FoldChange))
## Rows: 9235 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gene
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(deseq2_combat)[names(deseq2_combat) == 'gene'] <- 'Symbol'


newrow <- c("deseq2_combat", dim(deseq2_combat)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow

deseq2_sva <- read_csv("pseudobulk/DESeq2_sva.csv") %>%
  mutate(abs_lfc = abs(log2FoldChange))
## Rows: 4216 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gene
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(deseq2_sva)[names(deseq2_sva) == 'gene'] <- 'Symbol'

newrow <- c("deseq2_sva", dim(deseq2_sva)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
num_genes$nGenes = as.integer(num_genes$nGenes)
plot_genes(num_genes)

grid.newpage()
venn_object <- venn.diagram(
        x = list(deseq2$Symbol, deseq2_combat$Symbol, deseq2_sva$Symbol),
        category.names = c("deseq2", "deseq2_combat","deseq2_sva"),
        filename = NULL
        )
grid.draw(venn_object)

DESeq2 without batch correction found the least number of DE genes compared to the methods tested until now. Also, the DEgenes found with batch corrected methods differs from DESeq2.

DESeq2-LRT

print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(deseq2,"log2FoldChange")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188
## Picking joint bandwidth of 0.136

print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(deseq2,"log2FoldChange")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0609
## Picking joint bandwidth of 0.0838
## Picking joint bandwidth of 0.103

print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(deseq2,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.175
## Picking joint bandwidth of 0.0557
## Picking joint bandwidth of 0.0568

plot_upGOs(deseq2, "log2FoldChange")
## [1] "Upregulated GO terms"

plot_downGOs(deseq2, "log2FoldChange")
## [1] "Downregulated GO terms"

Genes with high and low logFC are well divided. Also, the genes with low abs(logDC) show better division between the cell types. BP terms aren’t really specific.

DESeq2-combat

print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(deseq2_combat,"log2FoldChange")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188

print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(deseq2_combat,"log2FoldChange")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.106
## Picking joint bandwidth of 0.103
## Picking joint bandwidth of 0.121

print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(deseq2_combat,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0838
## Picking joint bandwidth of 0.066
## Picking joint bandwidth of 0.0725

plot_upGOs(deseq2_combat, "log2FoldChange")
## [1] "Upregulated GO terms"

plot_downGOs(deseq2_combat, "log2FoldChange")
## [1] "Downregulated GO terms"

Genes with high and low logFC are well divided. Also, the genes with low abs(logDC) show better division between the cell types. BP terms aren’t really specific.

BP terms aren’t really specific. read this for vescicole and autophagy.

DESeq2-sva

print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(deseq2_sva,"log2FoldChange")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 1.92e-06
## Picking joint bandwidth of 0.098
## Picking joint bandwidth of 0.136

print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(deseq2_sva,"log2FoldChange")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 1.95e-06
## Picking joint bandwidth of 0.0277
## Picking joint bandwidth of 1.81e-06

print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(deseq2_sva,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0452
## Picking joint bandwidth of 0.0349
## Picking joint bandwidth of 2.26e-06

plot_upGOs(deseq2_sva, "log2FoldChange")
## [1] "Upregulated GO terms"

plot_downGOs(deseq2_sva, "log2FoldChange")
## [1] "Downregulated GO terms"

Despite both found ~4k DE genes, only a partial overlap of DE genes between DESeq2 + sva and Seurat. Also, the top lgFC genes have really poor expression.

limma

Normal and with combat

limma limma combat

limma <- read_csv("pseudobulk/limma_voom.csv")
## New names:
## * `` -> ...1
## Rows: 7580 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ...1
## dbl (7): logFC, AveExpr, t, P.Value, adj.P.Val, B, absFLC
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(limma)[names(limma) == '...1'] <- 'Symbol'

newrow <- c("limma", dim(limma)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow

limma_combat <- read_csv("pseudobulk/limma_combat.csv")
## New names:
## * `` -> ...1
## Rows: 7580 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ...1
## dbl (7): logFC, AveExpr, t, P.Value, adj.P.Val, B, absFLC
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(limma_combat)[names(limma_combat) == '...1'] <- 'Symbol'

newrow <- c("limma_combat", dim(limma_combat)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
plot_genes(num_genes)

grid.newpage()
venn_object <- venn.diagram(
        x = list(limma$Symbol, limma_combat$Symbol),
        category.names = c("limma", "limma_combat"),
        filename = NULL
        )
grid.draw(venn_object)

Limma found more than 7k genes, no differences with the batch corrected data

Limma-voom

print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(limma,"logFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188

print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(limma,"logFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.106
## Picking joint bandwidth of 0.103
## Picking joint bandwidth of 0.0609

print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(limma,"absFLC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 2.34e-06
## Picking joint bandwidth of 2.04e-06
## Picking joint bandwidth of 2.2e-06

plot_upGOs(limma, "logFC")
## [1] "Upregulated GO terms"

plot_downGOs(limma, "logFC")
## [1] "Downregulated GO terms"

High logFC are clear, low abs(logFC) aren’t clear Read this and this for cilia

EdgeR

Finally I tested edgeR with LRT as best option from here, I tested combat too. edgeR edgeR_combat

edger <- read_csv("pseudobulk/edgeR.csv")
## New names:
## * `` -> ...1
## Rows: 9415 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): ...1, genes
## dbl (6): logFC, logCPM, LR, PValue, FDR, absLFC
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(edger)[names(edger) == 'genes'] <- 'Symbol'

newrow <- c("edger", dim(edger)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow

edger_combat <- read_csv("pseudobulk/edgeR_combat.csv")
## New names:
## * `` -> ...1
## Rows: 9163 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): ...1, genes
## dbl (6): logFC, logCPM, LR, PValue, FDR, absLFC
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(edger_combat)[names(edger_combat) == 'genes'] <- 'Symbol'

newrow <- c("edger_combat", dim(edger_combat)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
plot_genes(num_genes)

grid.newpage()
venn_object <- venn.diagram(
        x = list(edger$Symbol, edger_combat$Symbol),
        category.names = c("edgeR", "edgeR_combat"),
        filename = NULL
        )
grid.draw(venn_object)

Both the methods found more than 9k genes with some hundreds uniquel per each method.

EdgeR

print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(edger,"logFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.098
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 2.06e-06

print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(edger,"logFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0977
## Picking joint bandwidth of 0.108
## Picking joint bandwidth of 0.121

print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(edger,"absLFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0325
## Picking joint bandwidth of 0.0525
## Picking joint bandwidth of 0.127

plot_upGOs(edger, "logFC")
## [1] "Upregulated GO terms"

plot_downGOs(edger, "logFC")
## [1] "Downregulated GO terms"

Extreme LogFC show different results from what obtained with other algorithms, maybe because of the hand-written method. Low abs(logFC) are not appreciable. BP terms looks good

EdgeR_combat

print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(edger_combat,"logFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188

print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(edger_combat,"logFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0977
## Picking joint bandwidth of 0.108
## Picking joint bandwidth of 0.121

print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(edger_combat,"absLFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0719
## Picking joint bandwidth of 0.0588
## Picking joint bandwidth of 0.0529

plot_upGOs(edger_combat, "logFC")
## [1] "Upregulated GO terms"

plot_downGOs(edger_combat, "logFC")
## [1] "Downregulated GO terms"

High number of DE genes found, the low(logFC) aren’t well distinguishable. GOOD BPs

HBA1/2 bimodal distribution

VlnPlot(dat, features = c("HBA1", "HBA2"), group.by = "bioprojects", split.by = "label", ncol = 2)
## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

dat <- subset(dat, subset = bioprojects == "PRJNA694128")
VlnPlot(dat, features = c("HBA1", "HBA2"), group.by = "batch", split.by = "label", ncol = 1)

RRMM_BM_8 has a lower expression of both the subunits… Interestingly is a longitudinal sample and we can look at the expression of the genes along the samples. Also, are those cells misslabelled? or the expression of HBA1/2 is lower in all the cell types. To check it I am loading the original dataset and plot the expression of all the samples from this patient:

rm(list = ls())
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  6723903 359.1   21899943 1169.6  27374928 1462.0
## Vcells 33947399 259.0  123000667  938.5 153750833 1173.1
data <- readRDS("MM_atlas.Rds")
data <- subset(data, label %in% c("Late erythroid progenitor", "Aberrant erythroid"))
data <- NormalizeData(data)

VlnPlot(data, features = c("HBA1", "HBA2"), group.by = "label", ncol = 1)

VlnPlot(data, features = c("HBA1", "HBA2"), group.by = "bioprojects", split.by= "label", ncol = 1)

#VlnPlot(data, features = c("HBA1", "HBA2"), group.by = "bioprojects", ncol = 1)
data <- subset(data, subset = bioprojects == "PRJNA694128")
VlnPlot(data, features = c("HBA1", "HBA2"), group.by = "batch",  ncol = 1)

data <- subset(data, subset = batch == c("remission_BM_1", "RRMM_BM_1", "RRMM_BM_8", "PMM_CD138p_1"))
VlnPlot(data, features = c("HBA1", "HBA2"), group.by = "batch", ncol = 1)

Interestingly also the expression of HBA1/2 in other cell lines is lower in RR_BM_8, even looking at the other longitudinal data from the same patient